{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/yphacker/opt/anaconda3/envs/py36/lib/python3.6/site-packages/ipykernel_launcher.py:3: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['title', 'location', 'department', 'salary_range', 'company_profile',\n",
      "       'description', 'requirements', 'benefits', 'telecommuting',\n",
      "       'has_company_logo', 'has_questions', 'employment_type',\n",
      "       'required_experience', 'required_education', 'industry', 'function',\n",
      "       'fraudulent'],\n",
      "      dtype='object')\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "527d9b53ace04326b905d2188c10e336",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=17880.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# 一些常规特征\n",
    "import pandas as pd\n",
    "from tqdm.autonotebook import *\n",
    "from bs4 import BeautifulSoup\n",
    "import re\n",
    "tqdm.pandas()\n",
    "\n",
    "train = pd.read_csv('../data/train.csv')\n",
    "test = pd.read_csv('../data/test.csv')\n",
    "data = pd.concat([train, test], axis=0, sort=False).reset_index(drop=True)\n",
    "print(data.columns)\n",
    "# 将所有列进行拼接\n",
    "data['review'] = data.progress_apply(lambda row:str(row['title']) + ' ' + str(row['department']) + ' ' + str(row['requirements']) + ' ' + str(row['benefits']), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "unable to import 'smart_open.gcs', disabling that module\n",
      "/Users/yphacker/opt/anaconda3/envs/py36/lib/python3.6/site-packages/gensim/models/base_any2vec.py:743: UserWarning: C extension not loaded, training will be slow. Install a C compiler and reinstall gensim for fast training.\n",
      "  \"C extension not loaded, training will be slow. \"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "w2v model done\n"
     ]
    }
   ],
   "source": [
    "from gensim.models import Word2Vec\n",
    "from collections import defaultdict\n",
    "text_list = list(data['review'])\n",
    "\n",
    "documents = text_list\n",
    "texts = [[word for word in str(document).split(' ') ] for document in documents]\n",
    "frequency = defaultdict(int)\n",
    "for text in texts:\n",
    "    for token in text:\n",
    "        frequency[token] += 1\n",
    "texts = [[token for token in text if frequency[token] >= 1] for text in texts]\n",
    "\n",
    "\n",
    "w2v = Word2Vec(texts, size=32, seed=1017)\n",
    "w2v.wv.save_word2vec_format('model/w2v_128_extend.txt')\n",
    "print(\"w2v model done\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "done\n"
     ]
    }
   ],
   "source": [
    "\n",
    "import gensim\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "import numpy as np\n",
    "\n",
    "def get_w2v_avg(word2vec_Path,documents=documents):\n",
    "    texts = documents\n",
    "    w2v_dim = 32\n",
    "    model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_Path, binary=False)\n",
    "    vacab = model.vocab.keys()\n",
    "    w2v_feature = np.zeros((len(texts), w2v_dim))\n",
    "    w2v_feature_avg = np.zeros((len(texts), w2v_dim))\n",
    "\n",
    "    for i, line in enumerate(texts):\n",
    "        num = 0\n",
    "        if line == 'null':\n",
    "            w2v_feature_avg[i, :] = np.zeros(w2v_dim)\n",
    "        else:\n",
    "            for word in line.split():\n",
    "                num += 1\n",
    "                if word==\"\":\n",
    "                    vec =  np.zeros(w2v_dim)\n",
    "                else:\n",
    "                    vec = model[word] if word in vacab else np.zeros(w2v_dim)\n",
    "                w2v_feature[i, :] += vec\n",
    "            w2v_feature_avg[i, :] = w2v_feature[i, :] / num\n",
    "    w2v_avg = pd.DataFrame(w2v_feature_avg)\n",
    "    w2v_avg.columns = ['w2v_avg_' + str(i) for i in w2v_avg.columns]\n",
    "    return w2v_avg\n",
    "w2v_avg_feat=get_w2v_avg('model/w2v_128_extend.txt')\n",
    "print(\"done\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "w2v_avg_feat.to_csv('feature/w2v_extend_feature.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
